Link to this problem is here


In [17]:
def hamming_distance(seq1, seq2):
    assert len(seq1) == len(seq2)
    return sum(ch1 != ch2 for ch1, ch2 in zip(seq1, seq2))

In [2]:
def dameraulevenshtein(seq1, seq2):
    """Calculate the Damerau-Levenshtein distance between sequences.

    This distance is the number of additions, deletions, substitutions,
    and transpositions needed to transform the first sequence into the
    second. Although generally used with strings, any sequences of
    comparable objects will work.

    Transpositions are exchanges of *consecutive* characters; all other
    operations are self-explanatory.

    This implementation is O(N*M) time and O(M) space, for N and M the
    lengths of the two sequences.

    >>> dameraulevenshtein('ba', 'abc')
    2
    >>> dameraulevenshtein('fee', 'deed')
    2

    It works with arbitrary sequences too:
    >>> dameraulevenshtein('abcd', ['b', 'a', 'c', 'd', 'e'])
    2
    """
    # Conceptually, this is based on a len(seq1) + 1 * len(seq2) + 1 matrix.
    # However, only the current and two previous rows are needed at once,
    # so we only store those.
    oneago = None
    thisrow = range(1, len(seq2) + 1) + [0]
    for x in xrange(len(seq1)):
        # Python lists wrap around for negative indices, so put the
        # leftmost column at the *end* of the list. This matches with
        # the zero-indexed strings and saves extra calculation.
        twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1]
        for y in xrange(len(seq2)):
            delcost = oneago[y] + 1
            addcost = thisrow[y - 1] + 1
            subcost = oneago[y - 1] + (seq1[x] != seq2[y])
            thisrow[y] = min(delcost, addcost, subcost)
            # This block deals with transpositions
            if (x > 0 and y > 0 and seq1[x] == seq2[y - 1]
                and seq1[x-1] == seq2[y] and seq1[x] != seq2[y]):
                thisrow[y] = min(thisrow[y], twoago[y - 2] + 1)
    return thisrow[len(seq2) - 1]

In [7]:
dameraulevenshtein(
'',
''
)


Out[7]:
405

In [13]:
zip('ATTGCT','AAGGCT')


Out[13]:
[('A', 'A'), ('T', 'A'), ('T', 'G'), ('G', 'G'), ('C', 'C'), ('T', 'T')]

In [18]:
hamming_distance(
'TAAAAATCCCGCATTCCTGTTTTGTGGGCGCCATCTTCTTTAGCCTAGGATGATAGAGGAGACGGGGGAAGTTGATATACTCGCGGCACTTTACAAGTATTTTTTGCCGTCGAAAGTCGCCACCAAAACCCTTGAACTACTAACGCCGCCCCATTGGAGTTAGTCTCCGCCACAGGTACAAGATGGTATTTCGGGAGCCGTAGACGCCCGTGTGGAAGCAAGGAGAATAAAAACGGAAAAGCTGCGCGTGGCTGATGGGTTGTATGGGTCTGCGTAGGTCGACCGATGATTGCAAAATTGCAAAAAAACGGAGTTTCATAATTCAGAATGCCGGATCAAAAAATTGGCAAGCCGGGCCCGCAAGGATGAAAAAAACAGCCTCTAAACGATGAGTCAAATTGAAGAGTCTACGCTGTGAAAGACTCCCTCTATGAGGACTCGCGACATGTTTTTAGTAACCTGGGTCTAAAACGTCCTCCCGACGGCGTTAGTTTGCAGAAACAGCAGTGGGGTTTAGTCTCCCATATCGATGAACCATATAAATCATGTCCAAAGTTATGTGGTACCTCCGGAACAAAATCGATGCGTGAAGTTGGCAATATTTAGTTTGATCACATAAGCCCTTTGAGGAACCGCAACGGTCGACGTGGCAGTTCAAAAGATTCACTTGGGGAAAAATTGCCCTGATGTGCGTCATAATCGAGGGTGGTATTCTGAACAACTAACCCTCCACGCCTGGCTAACGTATGAGTACCGTTGGTAACAGCGTCGAATCGGCTCCGCTACAAAGAAATTTTCATGACGGCACGCCACTCTGCCGTGACATCTAAGCAGAGACGTCCAAGCACTTGAGGGGACTCACTATAAGGATTGACTTGTCAATTCGGGGGAACAGCTCACCTCTTGCAACCCCAGAAAAGTTCCAACACCCACCAGAATGCTATACTGGGCTTCACTTCGGGTTGGCGCCCAGGTTTTATCCGGAGACGCTGGGGGATTA',
'TCATAATGTACGAGACAGATTTTCGCAGTTCGATCACCTTAGGGGTAGGACTCGCGTAGGGACGGGTTCAGCTATGCACCACAGTGACCTTTTGCAGTAGGTCTTGCCTTTGGAAAAAAGCTGGAAGAGGCTAAAACGGCTTGACAGGTGTCTAGTGAGTTAGCCGAAGGCCATGGTACAAGAAAGAATTTCGGAAAAAATACGACACCAGGAGGAGCCAAGTAACTTAAAGGCGGCCGAGGGGCAGAAAACCCATCAGTTTCGCGATTTTGCGTTGGGAGGCCCATTAGCCGCCGATGGCGAACAAGCGTAACAATACAAACAGTAATGCCTGACCCATCTCACGCGCGTCCAGGGCCAGAACGAGCAAAGACGCTACAGCTTAAATATTGTTCACAATCAAGTATGCCCGTTGTGTAAGACGCAATATATCGACCCCCGCGTCATGGATATTGAAACCTCCGTGCTATTACGCAACCCTAAGTGCCAAATCTACAGAAGTAGGAGAAGTCTTGCTTTTCCCGAAATTAGGATCCGTATATATCATTTCGTGGGTGATTAATTCCGAATGCAACCCAAGCCGTCTGTCCAGTTGGTACTATTGAGTTGACTCACGGAAAGCGAGCTAGGAGCCATATACTTAGTCGTTGAAACTCTACATATTCTGTGGGAAAATAAGCGGAACGATGTGAAGGATTGGAGAGTGAAGTAGTAACAACATACACCCATCCACGACTGGGTTACTTAAGGGTACCGTGCTGACAATCGTGTGGTCCGCCCCGATAAAAAAGTACCTTCAAGGCCGTCCCGCCGCGCGCCGTGGCATGTTGCCTCGGAATGTTTAGTGCTTGATTGCAAGTGGCTCGAAAAGTTTCTACCTAGTTCGAGGGTTCGGTCAGCCGCTCTATGCGCCTCGCAAGATAATATAAGCACCGTAACGATCTACTCTTCATGTTGTGTGTTTCATGCCCCCAAGTAGGTCCTTGCATGCGCTCGCTTA'
)


Out[18]:
501

This was the right answer!